#' ---
#' title: Reproduce Analyses in Section 3.4 (party name merge)
#' author: Joe Ornstein
#' date: 2025-07-06
#' version: 0.2
#' ---

# Remove every object from the current environment before the analysis starts.
# NOTE(review): when this file is source()d from a driver script this also
# deletes the driver's own variables -- confirm that is intended.
rm(list = ls())

# Section banner printed to the console.
cat('\n\n**Application 4: Multilingual Record Linkage**\n\n', sep = '')

library(tidyverse)
library(tinytable)

## Reproduce Table 3 ------------------------------

# Read the ParlGov election file and keep only the rows used in this section:
#   * countries other than the five English-speaking ones,
#   * seats assigned to parties (placeholder party labels are removed),
#   * parliamentary elections (not European Parliament).
elections <- read_csv('raw/parlgov_elections.csv', progress = FALSE) |>
  filter(
    !country_name %in% c('Australia', 'New Zealand', 'Canada',
                         'United Kingdom', 'Ireland'),
    !party_name_english %in% c('one seat', 'one-seat', 'no seat',
                               'others', 'ethnic', 'no party affiliation'),
    election_type == 'parliament'
  ) |>
  select(country_name, election_date,
         party_name_english, party_name,
         seats, left_right)

# Split the election data into the two tables that get linked:
# one keyed by the native-language party name (carrying seats) ...
native_names <- elections |>
  select(-party_name_english, -left_right)

# ... and one keyed by the English party name (carrying left_right),
# with the English name stored under `party_name` as well.
english_names <- elections |>
  select(-party_name, -seats) |>
  rename(party_name = party_name_english)

# Table 3a: the first four rows and the last row of the native-name table.
tb_3a <- slice(native_names, c(1:4, n()))
save_tt(tt(tb_3a), 'tables/table3a.tex', overwrite = TRUE)

# Table 3b: the same rows taken from the English-name table.
tb_3b <- slice(english_names, c(1:4, n()))
save_tt(tt(tb_3b), 'tables/table3b.tex', overwrite = TRUE)

## Reproduce Figure 1 --------------------------

# Language model and matching formula whose saved results are read; together
# they name the results directory (the formula is deparsed into its text form).
model <- 'gpt-4o-2024-11-20'
fmla <- match ~ sim + jw

results_dir <- file.path('data/parties-merge', model, deparse(fmla))
result_files <- list.files(results_dir,
                           pattern = '\\.RData$',
                           full.names = TRUE)

# Fail early with a clear message instead of erroring later on an empty table.
if (length(result_files) == 0) {
  stop('No .RData result files found in ', results_dir, call. = FALSE)
}

# Each file is expected to contain an object named `df`. Load every file into
# its own environment so that a file lacking `df` raises an error rather than
# silently re-using the `df` left behind by the previously loaded file.
df <- result_files |>
  lapply(function(path) {
    file_env <- new.env()
    load(path, envir = file_env)
    if (!exists('df', envir = file_env, inherits = FALSE)) {
      stop('File does not contain an object named `df`: ', path, call. = FALSE)
    }
    file_env$df
  }) |>
  bind_rows()

# From here on, keep only the parties that won seats
# (rows with missing or non-positive seat counts are removed).
elections <- elections |>
  filter(!is.na(seats), seats > 0)

# Actual seat-weighted left-right position of each parliament: one row per
# country and election date. `na.rm = TRUE` leaves parties with a missing
# left_right score out of the weighted mean.
# `.groups = 'drop'` returns an ungrouped table, so the result (and everything
# joined onto it) does not stay grouped by country_name.
actual <- elections |>
  group_by(country_name, election_date) |>
  summarize(lr_actual = weighted.mean(left_right, seats, na.rm = TRUE),
            .groups = 'drop')

# Estimated seat-weighted left-right position of each parliament, built from
# the linkage results in two steps.
estimate <- df |>
  # Step 1: for each party (A) in a given country and election, estimate its
  # left-right score as the match-probability-weighted mean of the left_right
  # values of its candidate matches.
  group_by(A, country_name, election_date) |>
  summarize(seats = unique(seats),
            lr_hat = weighted.mean(left_right, match_probability, na.rm = TRUE),
            .groups = 'drop') |>
  # Parties without a seat count cannot be weighted.
  filter(!is.na(seats)) |>
  # Step 2: seat-weighted mean of the party-level estimates per parliament.
  group_by(country_name, election_date) |>
  summarize(lr_estimated = weighted.mean(lr_hat, seats, na.rm = TRUE),
            .groups = 'drop')

# One row per parliament with the actual and the estimated ideology side by
# side. The join keys are spelled out so the join cannot silently change if
# either table gains another shared column.
parliaments <- left_join(actual, estimate,
                         by = c('country_name', 'election_date'))

# Figure 1: one panel per country; points show the estimated ideology and the
# line shows the actual ideology over election dates.
fig1 <- ggplot(parliaments) +
  geom_point(aes(x = election_date, y = lr_estimated)) +
  # (a line through the estimates is intentionally not drawn)
  geom_line(aes(x = election_date, y = lr_actual)) +
  facet_wrap(~country_name, scales = 'free', ncol = 5) +
  theme_bw() +
  labs(x = 'Election Date',
       y = 'Seat Share-Weighted Ideology of Parliament')

ggsave('figures/fig1.png', fig1, width = 8, height = 8)

# Print the correlation between estimated and actual parliament ideology,
# computed over the parliaments where both values are present.
cat('Correlation (seat-weighted ideology):',
    with(parliaments,
         cor(lr_estimated, lr_actual, use = 'pairwise.complete.obs')),
    '\n')

## Reproduce Figure A8 -------------------------------------

# Figure A8: estimated vs. actual ideology with a dashed 45-degree reference
# line; parliaments whose absolute error exceeds 1 are drawn in red.
fig_a8_data <- mutate(parliaments, abs_error = abs(lr_estimated - lr_actual))

fig_a8 <- ggplot(fig_a8_data) +
  geom_point(aes(x = lr_actual, y = lr_estimated, color = abs_error > 1),
             alpha = 0.7) +
  geom_abline(intercept = 0, slope = 1, linetype = 'dashed') +
  scale_color_manual(values = c('black', 'red')) +
  scale_x_continuous(limits = c(3.5, 7.5)) +
  scale_y_continuous(limits = c(3.5, 7.5)) +
  coord_equal() +
  labs(x = 'Seat-Weighted Ideology (Actual)',
       y = 'Seat-Weighted Ideology (Estimated)') +
  theme_bw() +
  theme(legend.position = 'none')

ggsave('figures/fig_a8.png', fig_a8, width = 8, height = 5)

## Reproduce Table A4 --------------------

# precision = true positives / (true positives + false positives)
#
# Take every pair the linkage identified as a match (match probability above
# 0.5, or labelled 'Yes') and flag it as a true match when the same
# native-name / English-name pair appears in `elections` for that country and
# election date.
# NOTE(review): at this point `elections` has already been reduced to parties
# with seats > 0, so an identified pair for a party without seats is counted
# as a false positive -- confirm this is intended.
precision <- df |>
  filter(!is.na(B)) |>
  filter(match_probability > 0.5 | match == 'Yes') |>
  select(country_name,
         election_date,
         party_name = A,
         party_name_english = B,
         match_probability, match) |>
  # Join keys are spelled out (they are the columns the two tables share) so
  # the join cannot silently change if a table gains another column.
  left_join(elections |> mutate(true_match = 1),
            by = c('country_name', 'election_date',
                   'party_name', 'party_name_english')) |>
  # Identified pairs with no counterpart in `elections` are false positives.
  mutate(true_match = replace_na(true_match, 0))

# Precision (in percent) for each country.
precision_by_country <- precision |>
  group_by(country_name) |>
  summarize(precision = mean(true_match) * 100)

cat('Precision (fuzzylink):', round( mean(precision$true_match) * 100, 1 ), '%\n')

# recall = true positives / (true positives + false negatives)
#
# Start from the true pairs in `elections` and flag each one that the linkage
# identified as a match (match probability above 0.5, or labelled 'Yes').
recall <- elections |>
  # keep only the countries we've fuzzylinked
  filter(country_name %in% df$country_name) |>
  # merge with the identified matches; the join keys are spelled out (they are
  # the columns the two tables share) so the join cannot silently change
  left_join(
    df |>
      filter(!is.na(B)) |>
      filter(match_probability > 0.5 | match == 'Yes') |>
      select(country_name,
             election_date,
             party_name = A,
             party_name_english = B,
             match_probability, match) |>
      mutate(identified_match = 1),
    by = c('country_name', 'election_date',
           'party_name', 'party_name_english')
  ) |>
  # True pairs with no identified counterpart are false negatives.
  mutate(identified_match = replace_na(identified_match, 0))

cat('Recall (fuzzylink):', round( mean(recall$identified_match) * 100, 1), '%\n')

# Recall (in percent) for each country.
recall_by_country <- recall |>
  group_by(country_name) |>
  summarize(recall = mean(identified_match) * 100)

# Table A4: precision and recall side by side, one row per country.
evaluation_metrics <- left_join(precision_by_country,
                                recall_by_country,
                                by = 'country_name')

evaluation_metrics |>
  select(`Country` = country_name,
         `Precision` = precision,
         `Recall` = recall) |>
  xtable::xtable(caption = 'Precision and Recall for Multilingual Record Linkage Application By Country',
                 label = 'tbl:parlgov-evaluation') |>
  # `type` is an argument of the print method, not of xtable() (where it was
  # silently ignored); passing it here pins the output format to LaTeX.
  print(type = 'latex', file = 'tables/table_a4.tex')
